# display plots inline
%matplotlib notebook
# imports
import os
import numpy as np
import pandas as pd
import pymc3 as pm
from bambi import Model, Prior
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import pymc3_utils as pmu
# suppress system warnings for legibility
import warnings
warnings.filterwarnings('ignore')
# resize plots to fit labels inside bounding box
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
# MPI color scheme
sns.set(style='white', palette='Set2')
# Load the ravens-task trials and the Tamil reading scores, then join them
# on participant id ('pp'); rows with missing values are dropped up front.
df_ravens = pd.read_csv('data/ravens.tsv', sep='\t').dropna()
df_reading = (pd.read_csv('data/tamil_reading.tsv', sep='\t')
              .drop(columns=['literate', 'subject']))
df_ravens = df_ravens.merge(df_reading, on='pp')
display(df_ravens.head())
# z-score the reading measure ('word') for use as a standardized covariate
df_ravens['reading_z'] = pmu.standardize(df_ravens['word'])
display(df_ravens.head())
# descriptive statistics per literacy group
display(df_ravens.groupby('literate').mean())
display(df_ravens.groupby('literate').std())
First we set the default sampling parameters for the regression procedure, following the settings reported in the paper.
# Shared keyword arguments passed to every bambi Model.fit() call below.
defaults = dict(
    samples=5000,                  # posterior draws per chain
    tune=2500,                     # adaptation / burn-in steps
    chains=4,
    init='advi+adapt_diag',        # ADVI initialization, then diagonal mass-matrix adaptation
    family='bernoulli',            # each item response is a 0/1 outcome
    priors=dict(fixed='narrow', random='narrow'),
)
We model each question in the ravens task as a Bernoulli trial (i.e., a generalized linear model). Since there are no trial-level predictors available, the only model compatible with our multilevel setup is an intercept-only one: an overall intercept plus by-participant and by-item random intercepts.
# Intercept-only Bernoulli GLM on item accuracy: an overall intercept plus
# random intercepts for items and for participants.
random_effects = ['1|item', '1|pp']
model_ravens_intercept = Model(df_ravens)
model_ravens_intercept.fit('ACC ~ 1', random=random_effects, **defaults)
display(pm.summary(model_ravens_intercept.backend.trace).round(2))
$\hat{R}$ values look good, as does the number of effective samples for the by-participant intercepts. Even for the overall intercept we have 8.5K effective samples.
As an additional check, we will plot the posterior traces for the selected model. Ideally these look unimodal and roughly normal. The plots on the righthand side should look like fuzzy caterpillars.
# Visual convergence check: plot the posterior traces for every parameter and
# save the figure to disk (high dpi for print-quality output).
g_traces = pm.traceplot(model_ravens_intercept.backend.trace)
plt.savefig('figures/ravens_model_traces.png', dpi=600)
We will now extract and store the posterior means of the participant intercepts so we can use them to model reading scores.
# Extract the posterior mean of every by-participant intercept from the trace.
# PyMC3 labels them '1|pp[0]' ... '1|pp[k-1]'.
pps = df_ravens['pp'].unique()
pp_nums = [f'1|pp[{i}]' for i in range(len(pps))]
df_intercepts = pm.summary(model_ravens_intercept.backend.trace).loc[pp_nums]
# NOTE(review): assumes the trace stores participant intercepts in sorted
# participant-label order — confirm against the model's factor coding.
df_intercepts['pp'] = np.sort(pps)
display(df_intercepts.head().round(2))
# Raw (uncorrected) per-participant mean accuracy, kept for comparison with
# the model-based intercepts.
df_uncorrected = df_ravens.groupby('pp', as_index=False).mean().rename(columns={'ACC': 'raw_ravens_mean'})
df_intercepts = df_intercepts[['pp', 'mean']].rename(columns={'mean': 'ravens_intercept'})
# Fix: the original chained .reset_index() after the merge, which inserted a
# redundant 'index' column; merge already returns a fresh RangeIndex.
df_intercepts = df_intercepts.merge(df_uncorrected[['pp', 'reading_z', 'raw_ravens_mean']],
                                    on='pp')
display(df_intercepts.head().round(2))
# Write to file; index=False keeps the spurious positional index out of the TSV
# (the original wrote it as an extra unnamed column).
df_intercepts.to_csv('data/ravens_intercepts.tsv', sep='\t', index=False)
Before closing this notebook, we will take a quick look at the correlations between ravens score and reading score.
display(df_intercepts[['raw_ravens_mean', 'ravens_intercept', 'reading_z']].corr().round(2))
Apparently it makes no difference whether we use the participant intercepts from our ravens model or the raw per-participant mean accuracy, since the correlation between these variables is nearly 1.